Expression level of putative TFs

Putative TFs are determined by genes with GO terms

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load data

Fs_LRT.1000 <-
  readr::read_csv("data/Fs_LRT_1000.csv")
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_LRT.1000 <-
  readr::read_csv("data/Fd_LRT_1000.csv")
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Load PES

Non-transformed

Fd_PES <-
  readr::read_csv(file = "data/Fd_PES.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F <-
#   readr::read_csv(file = "data/Fd_PES_F.csv")
# Fd_PES_M <-
#   readr::read_csv(file = "data/Fd_PES_M.csv")
Fs_PES <-
  readr::read_csv(file = "data/Fs_PES.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F <-
#   readr::read_csv(file = "data/Fs_PES_F.csv")
# Fs_PES_M <-
#   readr::read_csv(file = "data/Fs_PES_M.csv")
Ec_PES_32m <-
  readr::read_csv(file = "data/Ec_PES_32m.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f <-
  readr::read_csv(file = "data/Ec_PES_25f.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sqrt-tranformed

Fd_PES.sqrt <-
  readr::read_csv(file = "data/Fd_PES.sqrt.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F.sqrt <-
#   readr::read_csv(file = "data/Fd_PES_F.sqrt.csv")
# Fd_PES_M.sqrt <-
#   readr::read_csv(file = "data/Fd_PES_M.sqrt.csv")
Fs_PES.sqrt <-
  readr::read_csv(file = "data/Fs_PES.sqrt.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F.sqrt <-
#   readr::read_csv(file = "data/Fs_PES_F.sqrt.csv")
# Fs_PES_M.sqrt <-
#   readr::read_csv(file = "data/Fs_PES_M.sqrt.csv")
Ec_PES_32m.sqrt <-
  readr::read_csv(file = "data/Ec_PES_32m.sqrt.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f.sqrt <-
  readr::read_csv(file = "data/Ec_PES_25f.sqrt.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

log2-tranformed

Fd_PES.log2 <-
  readr::read_csv(file = "data/Fd_PES.log2.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F.log2 <-
#   readr::read_csv(file = "data/Fd_PES_F.log2.csv")
# Fd_PES_M.log2 <-
#   readr::read_csv(file = "data/Fd_PES_M.log2.csv")
Fs_PES.log2 <-
  readr::read_csv(file = "data/Fs_PES.log2.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F.log2 <-
#   readr::read_csv(file = "data/Fs_PES_F.log2.csv")
# Fs_PES_M.log2 <-
#   readr::read_csv(file = "data/Fs_PES_M.log2.csv")
Ec_PES_32m.log2 <-
  readr::read_csv(file = "data/Ec_PES_32m.log2.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f.log2 <-
  readr::read_csv(file = "data/Ec_PES_25f.log2.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

To avoid Error in vroom_(file, delim = delim %||% col_types$delim, col_names = col_names, : bad value , I load the packages later.

library(ggfortify)
library(tximport)
library(myTAI)
IPS_Fs <- 
        read_delim("data/Interproscan/Fs_ips/F_serratus_pep.filt.processed.faa.tsv",
                    delim = "\t", col_select = c(1,4,12:14)) %>%
        separate_rows(GO_term, sep = "\\|") %>% 
        dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
        distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 247635 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Fd <- 
        read_delim("data/Interproscan/Fd_ips/F_distichus_pep.filt.processed.faa.tsv",
                    delim = "\t", col_select = c(1,4,12:14)) %>%
        separate_rows(GO_term, sep = "\\|") %>% 
        dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
        distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 161271 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Ec <- 
        read_delim("data/Interproscan//Ec_ips/Ectocarpus-sp7.filt.processed.faa.tsv",
                    delim = "\t", col_select = c(1,4,12:14)) %>%
        separate_rows(GO_term, sep = "\\|") %>% 
        dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
        distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 224135 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Ec_full <- 
        read_delim("data/Interproscan//Ec_ips/Ectocarpus-sp7.filt.processed.faa.tsv",
                    delim = "\t", col_select = c(1,4,12:14)) %>%
        separate_rows(GO_term, sep = "\\|") %>% 
        # dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
        distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 224135 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Naive analysis of GO terms for transcription factors.

  • GO:0006355 (regulation of DNA-templated transcription)
  • GO:0003700 (DNA-binding transcription factor activity)
  • GO:0140110 (transcription regulator activity)
  • GO:0006351 (DNA-templated transcription)
  • GO:0051090 (regulation of DNA-binding transcription factor activity)
  • GO:0001217 (DNA-binding transcription repressor activity)
  • GO:0051091 (positive regulation of DNA-binding transcription factor activity)
  • GO:0043433 (negative regulation of DNA-binding transcription factor activity)
# ECTOCARPUS
# 248 genes selected
Ec_TF_list <- IPS_Ec %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
# For full gene name (315 genes selected)
Ec_TF_full_list <- IPS_Ec_full %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()

# FUCUS
# 269 genes selected
Fs_TF_list <- IPS_Fs %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
# 199 genes selected
Fd_TF_list <- IPS_Fd %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()

Q1: how many of the putative TFs are in the DEG.LRT list in Fucus?

Fs_TF_list %>%
        dplyr::mutate(LRT = GeneID %in% Fs_LRT.1000$GeneID) %>%
        ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
        ggplot2::geom_bar(position = position_stack(), colour = "black") +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::theme_classic() +
        ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
                      subtitle = "Fucus serratus") +
        ggplot2::coord_flip()

Fd_TF_list %>%
        dplyr::mutate(LRT = GeneID %in% Fd_LRT.1000$GeneID) %>%
        ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
        ggplot2::geom_bar(position = position_stack(), colour = "black") +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::theme_classic() +
        ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
                      subtitle = "Fucus distichus") +
        ggplot2::coord_flip()

What else may the LRT.1000 genes be?

IPS_Ec %>%
        dplyr::filter(GeneID %in% Ec_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fs %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fd %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
Fs_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        myTAI::CollapseReplicates(nrep = 5, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Fucus serratus") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Fd_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        myTAI::CollapseReplicates(nrep = 6, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Fucus distichus") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ec_PES_25f %>%
        myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Ectocarpus female") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ec_PES_32m %>%
        myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Ectocarpus male") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Q2: What about with a more restricted list?

Naive analysis of GO terms for transcription factors.

This is used in Piasecka et al. 2013

# ECTOCARPUS
# 209 genes selected
Ec_TF_list <- IPS_Ec %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
# For full gene name (269 genes selected)
Ec_TF_full_list <- IPS_Ec_full %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()

# FUCUS
# 213 genes selected
Fs_TF_list <- IPS_Fs %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
# 162 genes selected
Fd_TF_list <- IPS_Fd %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
Fs_TF_list %>%
        dplyr::mutate(LRT = GeneID %in% Fs_LRT.1000$GeneID) %>%
        ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
        ggplot2::geom_bar(position = position_stack(), colour = "black") +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::theme_classic() +
        ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
                      subtitle = "Fucus serratus") +
        ggplot2::coord_flip()

Fd_TF_list %>%
        dplyr::mutate(LRT = GeneID %in% Fd_LRT.1000$GeneID) %>%
        ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
        ggplot2::geom_bar(position = position_stack(), colour = "black") +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::theme_classic() +
        ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
                      subtitle = "Fucus distichus") +
        ggplot2::coord_flip()

What else may the LRT.1000 genes be?

IPS_Ec %>%
        dplyr::filter(GeneID %in% Ec_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fs %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fd %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>% 
        dplyr::select(-c(Analysis, InterPro_accession,GO_term))
Fs_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        myTAI::CollapseReplicates(nrep = 5, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Fucus serratus") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Fd_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        myTAI::CollapseReplicates(nrep = 6, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Fucus distichus") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ec_PES_25f %>%
        myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Ectocarpus female") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ec_PES_32m %>%
        myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
        dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
        ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_histogram() +
        ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
                      subtitle = "Ectocarpus male") +
        ggplot2::scale_x_sqrt() +
        ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This leads to essentially the same results.

Q3: how does the TAI now look like once these genes are filtered out?

Fs_pval_TFs <-
        Fs_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
        myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
                           permutations = 1000)
## 
## Total runtime of your permutation test: 0.006  seconds.
## Total runtime of your permutation test: 0.006  seconds.
## Total runtime of your permutation test: 0.004  seconds.
## Total runtime of your permutation test: 0.004  seconds.
Fd_pval_TFs <-
        Fd_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
        myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
                           permutations = 1000)
## 
## Total runtime of your permutation test: 0.016  seconds.
## Total runtime of your permutation test: 0.004  seconds.
## Total runtime of your permutation test: 0.004  seconds.
## Total runtime of your permutation test: 0.004  seconds.
Fs_PES_tS_res <- 
        Fs_pval_TFs %>%
        tibble::as_tibble(rownames = "transformation") %>%
        dplyr::rename("pval" = value) %>%
        dplyr::mutate(test = "FlatLineTest")
Fd_PES_tS_res <- 
        Fd_pval_TFs %>%
        tibble::as_tibble(rownames = "transformation") %>%
        dplyr::rename("pval" = value) %>%
        dplyr::mutate(test = "FlatLineTest")

Fs_PES_tS_res %>%
  ggplot2::ggplot(
    aes(
      y = -log10(pval), 
      x = transformation)) +
  ggplot2::geom_col(width = 0.05) +
  ggplot2::geom_point(size = 5) +
  ggplot2::geom_hline(
    yintercept = -log10(0.05), 
    colour = "#E64B35FF",
    linetype = 'dashed',
    size = 2
    ) +
  ggplot2::labs(
    title = "Fucus serratus",
    subtitle = "FlatLineTest",
    x = "Transformation"
  ) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Fd_PES_tS_res %>%
  ggplot2::ggplot(
    aes(
      y = -log10(pval), 
      x = transformation)) +
  ggplot2::geom_col(width = 0.05) +
  ggplot2::geom_point(size = 5) +
  ggplot2::geom_hline(
    yintercept = -log10(0.05), 
    colour = "#E64B35FF",
    linetype = 'dashed',
    size = 2
    ) +
  ggplot2::labs(
    title = "Fucus distichus",
    subtitle = "FlatLineTest",
    x = "Transformation"
  ) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic()

Fs_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
        myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.006  seconds.
## Significance status of signature:  not significant (= no evolutionary signature in the transcriptome).

Fd_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
        myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  not significant (= no evolutionary signature in the transcriptome).

Fs_PES.sqrt %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
        myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  not significant (= no evolutionary signature in the transcriptome).

Fd_PES.sqrt %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
        myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  not significant (= no evolutionary signature in the transcriptome).

Damn… Only 96 genes are retained in F. serratus (~1.2%) and 108 in F. distichus (~1.4%).

The flat line is not such an issue for Ectocarpus

Ec_M_pval_TFs <-
        Ec_PES_32m %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
                           permutations = 1000)
## 
## Total runtime of your permutation test: 0.006  seconds.
## Total runtime of your permutation test: 0.006  seconds.
## Total runtime of your permutation test: 0.005  seconds.
## Total runtime of your permutation test: 0.005  seconds.
Ec_F_pval_TFs <-
        Ec_PES_25f %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
                           permutations = 1000)
## 
## Total runtime of your permutation test: 0.005  seconds.
## Total runtime of your permutation test: 0.005  seconds.
## Total runtime of your permutation test: 0.004  seconds.
## Total runtime of your permutation test: 0.005  seconds.
Ec_F_PES_tS_res <- 
        Ec_F_pval_TFs %>%
        tibble::as_tibble(rownames = "transformation") %>%
        dplyr::rename("pval" = value) %>%
        dplyr::mutate(test = "FlatLineTest")
Ec_M_PES_tS_res <- 
        Ec_M_pval_TFs %>%
        tibble::as_tibble(rownames = "transformation") %>%
        dplyr::rename("pval" = value) %>%
        dplyr::mutate(test = "FlatLineTest")

Ec_F_PES_tS_res %>%
  ggplot2::ggplot(
    aes(
      y = -log10(pval), 
      x = transformation)) +
  ggplot2::geom_col(width = 0.05) +
  ggplot2::geom_point(size = 5) +
  ggplot2::geom_hline(
    yintercept = -log10(0.05), 
    colour = "#E64B35FF",
    linetype = 'dashed',
    size = 2
    ) +
  ggplot2::labs(
    title = "Ectocarpus female",
    subtitle = "FlatLineTest",
    x = "Transformation"
  ) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic()

Ec_M_PES_tS_res %>%
  ggplot2::ggplot(
    aes(
      y = -log10(pval), 
      x = transformation)) +
  ggplot2::geom_col(width = 0.05) +
  ggplot2::geom_point(size = 5) +
  ggplot2::geom_hline(
    yintercept = -log10(0.05), 
    colour = "#E64B35FF",
    linetype = 'dashed',
    size = 2
    ) +
  ggplot2::labs(
    title = "Ectocarpus male",
    subtitle = "FlatLineTest",
    x = "Transformation"
  ) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic()

Ec_PES_32m %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.007  seconds.
## Significance status of signature:  significant.

Ec_PES_25f %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.006  seconds.
## Significance status of signature:  not significant (= no evolutionary signature in the transcriptome).

Ec_PES_32m.sqrt %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  significant.

Ec_PES_25f.sqrt %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  significant.

Ec_PES_32m.log2 %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  significant.

Ec_PES_25f.log2 %>%
        dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
        myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running  1000  permutations.
## 
## Total runtime of your permutation test: 0.005  seconds.
## Significance status of signature:  significant.

I think these patterns are interesting but the interpretability is undermined by the fact that only ~168 genes are employed here. This less than 5% of genes, woefully ignoring the entire transcriptome.

Q4: what is the PS of putative TFs?

Fs_PES %>%
        dplyr::select(PS,GeneID) %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_bar() +
        ggplot2::labs(title = "Putative TFs PS",
                      subtitle = "Fucus serratus",
                      x = "PS", y = "Number of genes (log-scaled)") +
        ggplot2::scale_y_log10()

Fd_PES %>%
        dplyr::select(PS,GeneID) %>%
        dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
        ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_bar() +
        ggplot2::labs(title = "Putative TFs PS",
                      subtitle = "Fucus distichus",
                      x = "PS", y = "Number of genes (log-scaled)") +
        ggplot2::scale_y_log10()

Ec_PES_25f %>%
        dplyr::select(PS,GeneID) %>%
        dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
        ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
        ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
        ggplot2::geom_bar() +
        ggplot2::labs(title = "Putative TFs PS",
                      subtitle = "Ectocarpus",
                      x = "PS", y = "Number of genes (log-scaled)") +
        ggplot2::scale_y_log10()

Putative TFs are distributed across several PS.

Fs_PES %>%
    dplyr::select(PS,GeneID) %>%
    dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
    dplyr::group_by(PutativeTF) %>%
    dplyr::summarise(mean(PS))
Fs_PES_true <- Fs_PES %>%
    dplyr::filter(GeneID %in% Fs_TF_list$GeneID)
Fs_PES_false <- Fs_PES %>%
    dplyr::filter(!GeneID %in% Fs_TF_list$GeneID)
ks.test(Fs_PES_true$PS,Fs_PES_false$PS)
## Warning in ks.test.default(Fs_PES_true$PS, Fs_PES_false$PS): p-value will be
## approximate in the presence of ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  Fs_PES_true$PS and Fs_PES_false$PS
## D = 0.085279, p-value = 0.4951
## alternative hypothesis: two-sided
Fd_PES %>%
    dplyr::select(PS,GeneID) %>%
    dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
    dplyr::group_by(PutativeTF) %>%
    dplyr::summarise(mean(PS))
Fd_PES_true <- Fd_PES %>%
    dplyr::filter(GeneID %in% Fd_TF_list$GeneID)
Fd_PES_false <- Fd_PES %>%
    dplyr::filter(!GeneID %in% Fd_TF_list$GeneID)
ks.test(Fd_PES_true$PS,Fd_PES_false$PS)
## Warning in ks.test.default(Fd_PES_true$PS, Fd_PES_false$PS): p-value will be
## approximate in the presence of ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  Fd_PES_true$PS and Fd_PES_false$PS
## D = 0.14818, p-value = 0.01859
## alternative hypothesis: two-sided
Ec_PES_25f %>%
    dplyr::select(PS,GeneID) %>%
    dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
    dplyr::group_by(PutativeTF) %>%
    dplyr::summarise(mean(PS))
Ec_PES_true <- Ec_PES_25f %>%
    dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID)
Ec_PES_false <- Ec_PES_25f %>%
    dplyr::filter(!GeneID %in% Ec_TF_full_list$GeneID)
ks.test(Ec_PES_true$PS,Ec_PES_false$PS)
## Warning in ks.test.default(Ec_PES_true$PS, Ec_PES_false$PS): p-value will be
## approximate in the presence of ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  Ec_PES_true$PS and Ec_PES_false$PS
## D = 0.10315, p-value = 0.059
## alternative hypothesis: two-sided

PS is higher in putative TFs but it is not significantly so.

Q5: what is the expression level of putative TFs?

Since so little genes are considered, I can look at the expression profile of each one! Perhaps the ones with the most variance?

Fs_TF <- 
        Fs_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
        tidyr::pivot_longer(!c(PS,GeneID), names_to = "Stage", values_to = "TPM")
Fs_TF$Stage <- base::factor(Fs_TF$Stage, unique(Fs_TF$Stage))
Fd_TF <- 
        Fd_PES %>%
        dplyr::select(-c(gamete,matSP)) %>%
        dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
        tidyr::pivot_longer(!c(PS,GeneID), names_to = "Stage", values_to = "TPM")
Fd_TF$Stage <- base::factor(Fd_TF$Stage, unique(Fd_TF$Stage))

We filter for genes with mean TPM below 0.5 and CV (coefficient of variance) over 1. This is fair since we are selecting the genes with the most expression variance.

Fs_TF_highCV <-
        Fs_TF %>% 
        dplyr::group_by(GeneID) %>%
        dplyr::summarise(variance = var(TPM), 
                         mean_TPM = mean(TPM)) %>%
        dplyr::filter(mean_TPM > 0.5) %>%
        dplyr::group_by(GeneID) %>%
        dplyr::summarise(CV = sqrt(variance)/mean_TPM) %>%
        dplyr::filter(CV > 1)
Fs_TF %>% 
        dplyr::filter(GeneID %in% Fs_TF_highCV$GeneID) %>%
        ggplot2::ggplot(aes(x = Stage, 
                            y = log2(TPM+1), 
                            group = GeneID,
                            colour = GeneID)) +
        ggplot2::geom_line(size = 2, alpha = 0.5) +
        ggplot2::labs(title = "Fucus serratus") +
        scale_colour_viridis_d()

Fd_TF_highCV <-
        Fd_TF %>% 
        dplyr::group_by(GeneID) %>%
        dplyr::summarise(variance = var(TPM), 
                         mean_TPM = mean(TPM)) %>%
        dplyr::filter(mean_TPM > 0.5) %>%
        dplyr::group_by(GeneID) %>%
        dplyr::summarise(CV = sqrt(variance)/mean_TPM) %>%
        dplyr::filter(CV > 1)
Fd_TF %>% 
        dplyr::filter(GeneID %in% Fd_TF_highCV$GeneID) %>%
        ggplot2::ggplot(aes(x = Stage, 
                            y = log2(TPM+1), 
                            group = GeneID,
                            colour = GeneID)) +
        ggplot2::geom_line(size = 2, alpha = 0.5) +
        ggplot2::labs(title = "Fucus distichus") +
        scale_colour_viridis_d()

> Fs_TF_highCV$GeneID
[1] "ser_TRINITY_DN11159_c0_g1" "ser_TRINITY_DN11241_c0_g2"
[3] "ser_TRINITY_DN16240_c0_g2" "ser_TRINITY_DN46003_c0_g1"
> Fd_TF_highCV$GeneID
[1] "dis_TRINITY_DN1365_c0_g3" "dis_TRINITY_DN329_c0_g1" 
[3] "dis_TRINITY_DN3687_c0_g2" "dis_TRINITY_DN7838_c0_g3"
[5] "dis_TRINITY_DN8105_c0_g3"

It appears dis_TRINITY_DN329_c0_g1 and dis_TRINITY_DN8105_c0_g3 share a similar expression profile to

source("functions/parse_orthogroups.R")

OG_tx2gene <- 
  parse_orthogroups("data/Results_Nov29/Orthogroups/Orthogroups.tsv") %>% 
  stack() %>% 
  tidyr::as_tibble() %>% 
  dplyr::rename(GeneID = values) %>% 
  dplyr::rename(OG = ind) %>%
  dplyr::relocate(GeneID) %>%
  dplyr::mutate(GeneID = str_remove(GeneID, "\\.p[0-9].*"))
> OG_tx2gene %>% 
+     dplyr::filter(GeneID %in% c(Fd_TF_highCV$GeneID, Fs_TF_highCV$GeneID))
# A tibble: 8 × 2
  GeneID                    OG       
  <chr>                     <fct>    
1 dis_TRINITY_DN329_c0_g1   OG0003154
2 dis_TRINITY_DN3687_c0_g2  OG0003246
3 dis_TRINITY_DN3687_c0_g3  OG0003246
4 ser_TRINITY_DN11159_c0_g1 OG0003687
5 dis_TRINITY_DN8105_c0_g3  OG0003946
6 ser_TRINITY_DN16240_c0_g2 OG0005153
7 dis_TRINITY_DN1365_c0_g3  OG0005332
8 ser_TRINITY_DN11241_c0_g2 OG0007518

These genes are not related… It is simply not easy to compare the expression of orthogroup genes.

Q5: what is the expression correlation when examining just putative TFs?

OG_filt <- 
        OG_tx2gene %>%
        dplyr::filter(GeneID %in% c(Fs_TF_list$GeneID, Fd_TF_list$GeneID))
Fs_abundance_filt <-
        readr::read_csv(file = "data/Fs_OG_abundance.csv") %>%
        dplyr::filter(OG %in% OG_filt$OG)
## Rows: 5596 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): OG
## dbl (48): Fs_F_gametes_1, Fs_F_gametes_2, Fs_F_gametes_3, Fs_M_gametes_1, Fs...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_abundance_filt <-
        readr::read_csv(file = "data/Fd_OG_abundance.csv") %>%
        dplyr::filter(OG %in% OG_filt$OG)
## Rows: 5410 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): OG
## dbl (34): Fd_f_gametes_1, Fd_f_gametes_2, Fd_f_gametes_3, Fd_m_gametes_1, Fd...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Many OGs are lost this way. We have 78 OGs in Fd (from 5410) and 77 in Fs (from 5596). By applying inner_join, we obtain just 59 OGs.

sample_info_fd <-
  readr::read_csv("data/sample_info_fd.csv")
## Rows: 34 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_fs <-
  readr::read_csv("data/sample_info_fs.csv")
## Rows: 48 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_fd_embryo <-
  sample_info_fd %>%
  dplyr::filter(Stage %in% c("E1", "E2", "E3", "E4", "E5", "E6")) %>%
  dplyr::mutate(LibLab = str_c(Species, "_", Stage, "_", Replicate))
sample_info_fs_embryo <-
  sample_info_fs %>%
  dplyr::filter(Stage %in% c("24H", "48H", "1w", "3w", "4w")) %>%
  dplyr::mutate(Replicate = case_when(
    LibName == "Fs_X_zygotes_48h_1" ~ 4,
    LibName == "Fs_X_zygotes_48h_2" ~ 5,
    LibName == "Fs_X_zygotes_48h_3" ~ 6,
    TRUE ~ Replicate
  )) %>%
  mutate(LibLab = str_c(Species, "_", Stage, "_", Replicate))

PCA using OGs

selectGenes <- function(counts, min.count=2){
  keep <-
    counts[rowMeans(counts)>min.count, ]
  return(keep)
}

To minimise differences between species, I remove OGs with low counts.

combined_metatable <- 
  rbind(sample_info_fs_embryo, sample_info_fd_embryo) %>%
  relocate(LibName)
merged_TPM_pre <- 
  merge(Fs_abundance_filt, Fd_abundance_filt, by = "OG") %>%
  dplyr::select(1, combined_metatable$LibName)
merged_TPM <- 
  data.frame(row.names = merged_TPM_pre[,1], merged_TPM_pre[,-1], check.names = FALSE)
merged_TPM.filt <-
  as.matrix(merged_TPM) %>%
  selectGenes(min.count=10)

10 OGs are lost after the TPM 10 filter (from 59 to 49).

log2_TPM <- log2(merged_TPM.filt+1)
sqrt_TPM <- sqrt(merged_TPM.filt)
tpm10_TPM <- (merged_TPM.filt > 10) * 1
combined_metatable$Stage <- factor(combined_metatable$Stage, levels = unique(combined_metatable$Stage))

myPCAfunction <- function(pcDat, data, x, y, ...){
  OUT <- 
    ggplot2::autoplot(
      pcDat,
      data = data,
      frame.colour = "Stage",
      colour = "Stage",
      shape = "Species",
      x = x,
      y = y,
      size = 3,
      frame = TRUE,
      alpha = 0.5,
      ...)
  return(OUT)
}
pcDat <- prcomp(t(log2_TPM), scale = TRUE)

PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
  labs(title = "PC1 and PC2") +
  theme_grey()

PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
  labs(title = "PC1 and PC3") +
  theme_grey()

log2_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
ggpubr::annotate_figure(log2_plot, top = ggpubr::text_grob("log2", 
                                                   face = "bold", size = 14))

pcDat <- prcomp(t(sqrt_TPM), scale = TRUE)
# pcDat <- prcomp(t(rlog_TPM))

PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
  labs(title = "PC1 and PC2") +
  theme_grey()

PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
  labs(title = "PC1 and PC3") +
  theme_grey()

sqrt_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
ggpubr::annotate_figure(sqrt_plot, top = ggpubr::text_grob("sqrt", 
                                                   face = "bold", size = 14))

# pcDat <- prcomp(t(tpm10_TPM)) # too many zeros to do a scale = TRUE
# 
# PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
#   labs(title = "PC1 and PC2") +
#   theme_grey()
# 
# PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
#   labs(title = "PC1 and PC3") +
#   theme_grey()
# 
# rlog_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
# ggpubr::annotate_figure(rlog_plot, top = ggpubr::text_grob("tpm10 threshold", 
#                                                    face = "bold", size = 14))

Plot with loadings

# with loading
pcDat <- prcomp(t(log2_TPM), scale = TRUE)

myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2, loadings = TRUE, loadings.colour = 'black', loadings.label.colour="black", loadings.label = TRUE, loadings.label.size = 2, loadings.label.repel=TRUE) +
  labs(title = "PC1 and PC2", subtitle = "log2 with loading") +
  theme_grey()
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3, loadings = TRUE, loadings.colour = 'black', loadings.label.colour="black", loadings.label = TRUE, loadings.label.size = 2, loadings.label.repel=TRUE) +
  labs(title = "PC1 and PC3", subtitle = "log2 with loading") +
  theme_grey()

The general structure of the embryogenesis PCA is maintained when using putative TFs. Unfortunately, the expression level of only 49 OGs are captured here. The TPM10 threshold performs very poorly here so I have removed it.

Distance metrics

Here, we use distance metrics to quantify how distant the stages are from each other across Fucus species and maybe even in Ectocarpus.

library(philentropy)
ncol_Fs <- grep( "Fs" , colnames( log2_TPM ) )
ncol_Fd <- grep( "Fd" , colnames( log2_TPM ) )

Using log2(TPM+1)

log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab

Pearson correlation

M = cor(x = log2_TPM_renamed[,ncol_Fs], 
        y = log2_TPM_renamed[,ncol_Fd], 
        method = "pearson")

M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>% 
  select(Var2 = LibLab, Stage) %>% 
  full_join(M_melt, multiple = "all") %>%
  dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>% 
  select(Var1 = LibLab, Stage) %>% 
  full_join(tmp1, multiple = "all") %>%
  dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_gradient(low = "white", high = "#DC0000FF") +
  labs(x = "Fucus serratus samples",
       y = "Fucus distichus samples",
       title = "Pearson correlation (log2)",
       fill = "median(correlation)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.

Spearman correlation

Monotonic relationship

log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab

M = cor(x = log2_TPM_renamed[,ncol_Fs], 
        y = log2_TPM_renamed[,ncol_Fd], 
        method = "spearman")

M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>% 
  select(Var2 = LibLab, Stage) %>% 
  full_join(M_melt, multiple = "all") %>%
  dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>% 
  select(Var1 = LibLab, Stage) %>% 
  full_join(tmp1, multiple = "all") %>%
  dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_gradient(low = "white", high = "#DC0000FF") +
  labs(x = "Fucus serratus samples",
       y = "Fucus distichus samples",
       title = "Spearman correlation (log2)",
       fill = "median(correlation)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.

Manhattan distance

DM <- philentropy::distance(t(log2_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]

M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>% 
  select(Var2 = LibLab, Stage) %>% 
  full_join(M_melt, multiple = "all") %>%
  dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>% 
  select(Var1 = LibLab, Stage) %>% 
  full_join(tmp1, multiple = "all") %>%
  dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(median_corr,0))), color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_gradient(low = "white", high = "#3C5488FF") +
  labs(x = "Fucus serratus samples",
       y = "Fucus distichus samples",
       title = "Manhattan distance (log2)",
       fill = "median(distance)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.

JSD metric

mat_normalized <- apply(log2_TPM_renamed, 2, function(x) x/sum(x)) %>% as.data.frame()
# colSums(mat_normalized)
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
DM3 <- sqrt(DM2)

M_melt <- reshape2::melt(DM3)
tmp1 <- sample_info_fd_embryo %>% 
  select(Var2 = LibLab, Stage) %>% 
  full_join(M_melt, multiple = "all") %>%
  dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>% 
  select(Var1 = LibLab, Stage) %>% 
  full_join(tmp1, multiple = "all") %>%
  dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  scale_fill_gradient(low = "white", high = "#3C5488FF") +
  labs(x = "Fucus serratus samples",
       y = "Fucus distichus samples",
       title = "JSD metric (norm. log2)",
       fill = "median(distance)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.

It is only the very early stages that have the most correlated expression putative TF marker. This is seen with the Pearson and Spearman correlations and can be analogously identified with distance-based approaches.

devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.2 (2022-10-31)
##  os       macOS Big Sur ... 10.16
##  system   x86_64, darwin17.0
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Europe/Berlin
##  date     2023-12-04
##  pandoc   3.1.6.2 @ /usr/local/bin/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version    date (UTC) lib source
##  abind          1.4-5      2016-07-21 [1] CRAN (R 4.2.0)
##  backports      1.4.1      2021-12-13 [1] CRAN (R 4.2.0)
##  bit            4.0.5      2022-11-15 [1] CRAN (R 4.2.0)
##  bit64          4.0.5      2020-08-30 [1] CRAN (R 4.2.0)
##  broom          1.0.5      2023-06-09 [1] CRAN (R 4.2.0)
##  bslib          0.5.1      2023-08-11 [1] CRAN (R 4.2.0)
##  cachem         1.0.8      2023-05-01 [1] CRAN (R 4.2.0)
##  callr          3.7.3      2022-11-02 [1] CRAN (R 4.2.0)
##  car            3.1-2      2023-03-30 [1] CRAN (R 4.2.0)
##  carData        3.0-5      2022-01-06 [1] CRAN (R 4.2.0)
##  cli            3.6.1      2023-03-23 [1] CRAN (R 4.2.0)
##  codetools      0.2-19     2023-02-01 [1] CRAN (R 4.2.0)
##  colorspace     2.1-0      2023-01-23 [1] CRAN (R 4.2.0)
##  cowplot        1.1.1      2020-12-30 [1] CRAN (R 4.2.0)
##  crayon         1.5.2      2022-09-29 [1] CRAN (R 4.2.0)
##  devtools       2.4.5      2022-10-11 [1] CRAN (R 4.2.0)
##  digest         0.6.33     2023-07-07 [1] CRAN (R 4.2.0)
##  dplyr        * 1.1.3      2023-09-03 [1] CRAN (R 4.2.0)
##  ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.2.0)
##  evaluate       0.22       2023-09-29 [1] CRAN (R 4.2.2)
##  fansi          1.0.5      2023-10-08 [1] CRAN (R 4.2.2)
##  farver         2.1.1      2022-07-06 [1] CRAN (R 4.2.0)
##  fastmap        1.1.1      2023-02-24 [1] CRAN (R 4.2.0)
##  fitdistrplus   1.1-11     2023-04-25 [1] CRAN (R 4.2.0)
##  forcats      * 1.0.0      2023-01-29 [1] CRAN (R 4.2.0)
##  foreach        1.5.2      2022-02-02 [1] CRAN (R 4.2.0)
##  fs             1.6.3      2023-07-20 [1] CRAN (R 4.2.0)
##  generics       0.1.3      2022-07-05 [1] CRAN (R 4.2.0)
##  ggfortify    * 0.4.16     2023-03-20 [1] CRAN (R 4.2.0)
##  ggplot2      * 3.4.4      2023-10-12 [1] CRAN (R 4.2.2)
##  ggpubr         0.6.0      2023-02-10 [1] CRAN (R 4.2.0)
##  ggrepel        0.9.4      2023-10-13 [1] CRAN (R 4.2.2)
##  ggsignif       0.6.4      2022-10-13 [1] CRAN (R 4.2.0)
##  glue           1.6.2      2022-02-24 [1] CRAN (R 4.2.0)
##  gridExtra      2.3        2017-09-09 [1] CRAN (R 4.2.0)
##  gtable         0.3.4      2023-08-21 [1] CRAN (R 4.2.0)
##  hms            1.1.3      2023-03-21 [1] CRAN (R 4.2.0)
##  htmltools      0.5.6.1    2023-10-06 [1] CRAN (R 4.2.2)
##  htmlwidgets    1.6.2      2023-03-17 [1] CRAN (R 4.2.0)
##  httpuv         1.6.11     2023-05-11 [1] CRAN (R 4.2.2)
##  iterators      1.0.14     2022-02-05 [1] CRAN (R 4.2.0)
##  jquerylib      0.1.4      2021-04-26 [1] CRAN (R 4.2.0)
##  jsonlite       1.8.7      2023-06-29 [1] CRAN (R 4.2.0)
##  knitr          1.44       2023-09-11 [1] CRAN (R 4.2.0)
##  labeling       0.4.3      2023-08-29 [1] CRAN (R 4.2.0)
##  later          1.3.1      2023-05-02 [1] CRAN (R 4.2.2)
##  lattice        0.21-9     2023-10-01 [1] CRAN (R 4.2.2)
##  lifecycle      1.0.3      2022-10-07 [1] CRAN (R 4.2.0)
##  lubridate    * 1.9.3      2023-09-27 [1] CRAN (R 4.2.0)
##  magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.2.0)
##  MASS           7.3-60     2023-05-04 [1] CRAN (R 4.2.2)
##  Matrix         1.5-4.1    2023-05-18 [1] CRAN (R 4.2.0)
##  memoise        2.0.1      2021-11-26 [1] CRAN (R 4.2.0)
##  mime           0.12       2021-09-28 [1] CRAN (R 4.2.0)
##  miniUI         0.1.1.1    2018-05-18 [1] CRAN (R 4.2.0)
##  munsell        0.5.0      2018-06-12 [1] CRAN (R 4.2.0)
##  myTAI        * 1.0.1.9000 2023-10-03 [1] Github (drostlab/myTAI@e159136)
##  philentropy  * 0.7.0      2022-11-05 [1] CRAN (R 4.2.0)
##  pillar         1.9.0      2023-03-22 [1] CRAN (R 4.2.0)
##  pkgbuild       1.4.2      2023-06-26 [1] CRAN (R 4.2.0)
##  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.2.0)
##  pkgload        1.3.3      2023-09-22 [1] CRAN (R 4.2.0)
##  plyr           1.8.9      2023-10-02 [1] CRAN (R 4.2.2)
##  prettyunits    1.2.0      2023-09-24 [1] CRAN (R 4.2.0)
##  processx       3.8.2      2023-06-30 [1] CRAN (R 4.2.0)
##  profvis        0.3.8      2023-05-02 [1] CRAN (R 4.2.0)
##  promises       1.2.1      2023-08-10 [1] CRAN (R 4.2.2)
##  ps             1.7.5      2023-04-18 [1] CRAN (R 4.2.0)
##  purrr        * 1.0.2      2023-08-10 [1] CRAN (R 4.2.2)
##  R6             2.5.1      2021-08-19 [1] CRAN (R 4.2.0)
##  Rcpp           1.0.11     2023-07-06 [1] CRAN (R 4.2.0)
##  readr        * 2.1.4      2023-02-10 [1] CRAN (R 4.2.0)
##  remotes        2.4.2.1    2023-07-18 [1] CRAN (R 4.2.2)
##  reshape2       1.4.4      2020-04-09 [1] CRAN (R 4.2.0)
##  rlang          1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
##  rmarkdown      2.25       2023-09-18 [1] CRAN (R 4.2.2)
##  rstatix        0.7.2      2023-02-01 [1] CRAN (R 4.2.0)
##  rstudioapi     0.15.0     2023-07-07 [1] CRAN (R 4.2.0)
##  sass           0.4.7      2023-07-15 [1] CRAN (R 4.2.0)
##  scales         1.2.1      2022-08-20 [1] CRAN (R 4.2.0)
##  sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.2.0)
##  shiny          1.7.5.1    2023-10-14 [1] CRAN (R 4.2.2)
##  stringi        1.7.12     2023-01-11 [1] CRAN (R 4.2.0)
##  stringr      * 1.5.0      2022-12-02 [1] CRAN (R 4.2.0)
##  survival       3.5-7      2023-08-14 [1] CRAN (R 4.2.0)
##  tibble       * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
##  tidyr        * 1.3.0      2023-01-24 [1] CRAN (R 4.2.0)
##  tidyselect     1.2.0      2022-10-10 [1] CRAN (R 4.2.0)
##  tidyverse    * 2.0.0      2023-02-22 [1] CRAN (R 4.2.0)
##  timechange     0.2.0      2023-01-11 [1] CRAN (R 4.2.0)
##  tximport     * 1.26.1     2022-12-16 [1] Bioconductor
##  tzdb           0.4.0      2023-05-12 [1] CRAN (R 4.2.2)
##  urlchecker     1.0.1      2021-11-30 [1] CRAN (R 4.2.0)
##  usethis        2.2.2      2023-07-06 [1] CRAN (R 4.2.0)
##  utf8           1.2.3      2023-01-31 [1] CRAN (R 4.2.0)
##  vctrs          0.6.4      2023-10-12 [1] CRAN (R 4.2.2)
##  viridisLite    0.4.2      2023-05-02 [1] CRAN (R 4.2.0)
##  vroom          1.6.4      2023-10-02 [1] CRAN (R 4.2.2)
##  withr          2.5.1      2023-09-26 [1] CRAN (R 4.2.0)
##  xfun           0.40       2023-08-09 [1] CRAN (R 4.2.2)
##  xtable         1.8-4      2019-04-21 [1] CRAN (R 4.2.0)
##  yaml           2.3.7      2023-01-23 [1] CRAN (R 4.2.0)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
## 
## ──────────────────────────────────────────────────────────────────────────────